library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(viridis)
## Loading required package: viridisLite
library(stringr)

# Read the employee case-study data (CaseStudy2-data.csv), resolved relative
# to the current working directory.
casestudy2 <- read.csv(".//CaseStudy2-data.csv")

# Attrition by Department
# Stacked bar per department in which each segment is the share of that
# department's employees with a given Attrition value, labelled as a percent.
# The share is computed from the count stat: each bar segment's count divided
# by the total count of its x position (department).
# after_stat() replaces the `..count..` / `..x..` notation, which is
# deprecated in current ggplot2 releases.
ggplot(casestudy2, aes(x = as.factor(Department), fill = Attrition)) +
  geom_bar(
    aes(y = after_stat(count / tapply(count, x, sum)[x])),
    position = "stack",
    width = 0.5
  ) +
  geom_text(
    aes(
      y = after_stat(count / tapply(count, x, sum)[x]),
      label = after_stat(scales::percent(count / tapply(count, x, sum)[x]))
    ),
    stat = "count",
    position = position_stack(0.9),
    vjust = 0.5
  ) +
  xlab("Department") +
  ylab("Percent of Attrition") +
  # Wrap long department names so the axis labels do not overlap.
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  theme(axis.text = element_text(size = 7))

# Numeric attrition flag: 1 where Attrition is "Yes", 0 for every other value
# (missing values included), so attrition can be summed per group.
casestudy2$Attritioncalc <- as.numeric(casestudy2$Attrition %in% "Yes")

# Summary by Department: mean monthly income, attrition rate (leavers divided
# by headcount) and headcount, ordered from the largest department down.
er <- casestudy2 %>%
  group_by(Department) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n()
  ) %>%
  arrange(desc(Employees))
er
## # A tibble: 3 × 4
##   Department             meanincome calcAttrition Employees
##   <chr>                       <dbl>         <dbl>     <int>
## 1 Research & Development      6173.         0.133       562
## 2 Sales                       6789.         0.216       273
## 3 Human Resources             6776.         0.171        35
# Attrition % by Job Role and Department
# Summary table: mean monthly income, attrition rate (leavers divided by
# headcount) and headcount for every Department / JobRole pair, ordered from
# the largest group down.
er <- casestudy2 %>%
  group_by(Department, JobRole) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n(),
    # State summarise()'s default explicitly: the result stays grouped by
    # Department exactly as before, but the "has grouped output by" message
    # is no longer emitted.
    .groups = "drop_last"
  ) %>%
  arrange(desc(Employees))
er
## # A tibble: 11 × 5
## # Groups:   Department [3]
##    Department             JobRole             meanincome calcAttrition Employees
##    <chr>                  <chr>                    <dbl>         <dbl>     <int>
##  1 Sales                  Sales Executive          6892.        0.165        200
##  2 Research & Development Research Scientist       3259.        0.186        172
##  3 Research & Development Laboratory Technic…      3222.        0.196        153
##  4 Research & Development Manufacturing Dire…      7505.        0.0230        87
##  5 Research & Development Healthcare Represe…      7435.        0.105         76
##  6 Sales                  Sales Representati…      2653.        0.453         53
##  7 Research & Development Research Director       15750.        0.0196        51
##  8 Human Resources        Human Resources          3285.        0.222         27
##  9 Research & Development Manager                 17139.        0.0870        23
## 10 Sales                  Manager                 16719.        0.1           20
## 11 Human Resources        Manager                 18560         0              8
# Graph: one bubble per Department / JobRole pair from the summary table `er`.
# Colour encodes headcount (Employees); size encodes attrition rate
# (calcAttrition).
# - alpha is a fixed setting, so it is passed outside aes(); inside aes() it
#   is treated as a data mapping and adds a meaningless "alpha" legend.
# - The former geom_polygon() layer is dropped: both axes are categorical, so
#   joining the summary rows into a filled polygon did not represent anything.
ggplot(er, aes(x = Department, y = JobRole)) +
  geom_point(aes(color = Employees, size = calcAttrition), alpha = 0.5) +
  scale_color_viridis(option = "viridis", name = "Employees") +
  scale_size(range = c(1, 10), name = "Attrition %") +
  ggtitle("Attrition by Role")

# Attrition % by Job Level and Department
# Summary: mean monthly income, attrition rate (leavers divided by headcount)
# and headcount for every Department / JobLevel pair, ordered from the largest
# group down.
er <- casestudy2 %>%
  group_by(Department, JobLevel) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n(),
    # State summarise()'s default explicitly: the result stays grouped by
    # Department exactly as before, but the "has grouped output by" message
    # is no longer emitted.
    .groups = "drop_last"
  ) %>%
  arrange(desc(Employees))
er
## # A tibble: 15 × 5
## # Groups:   Department [3]
##    Department             JobLevel meanincome calcAttrition Employees
##    <chr>                     <int>      <dbl>         <dbl>     <int>
##  1 Research & Development        1      2793.        0.219        256
##  2 Research & Development        2      5435.        0.0542       166
##  3 Sales                         2      5678.        0.146        144
##  4 Research & Development        3     10248.        0.0909        77
##  5 Sales                         3      9331.        0.189         53
##  6 Sales                         1      2519.        0.48          50
##  7 Research & Development        4     15374.        0.0256        39
##  8 Research & Development        5     19304.        0.0833        24
##  9 Human Resources               1      2691.        0.261         23
## 10 Sales                         4     14863.        0.105         19
## 11 Sales                         5     18965.        0.286          7
## 12 Human Resources               5     19207.        0              6
## 13 Human Resources               2      4982.        0              2
## 14 Human Resources               3      8412.        0              2
## 15 Human Resources               4     16618         0              2
# Graph: one bubble per Department / JobLevel pair from the summary table `er`.
# Colour encodes headcount (Employees); size encodes attrition rate
# (calcAttrition).
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# treated as a data mapping and adds a meaningless "alpha" legend.
ggplot(er, aes(x = Department, y = JobLevel)) +
  geom_point(aes(color = Employees, size = calcAttrition), alpha = 0.5) +
  scale_color_viridis(option = "viridis", name = "Employees") +
  scale_size(range = c(1, 10), name = "Attrition %") +
  ggtitle("Attrition by Job Level")

# Summary: mean monthly income, attrition rate (leavers divided by headcount)
# and headcount for every Department / JobLevel pair, ordered from the largest
# group down.
# NOTE(review): this recomputes the same Department / JobLevel summary as the
# "Attrition % by Job Level and Department" section above - confirm whether
# the repetition is intentional.
er <- casestudy2 %>%
  group_by(Department, JobLevel) %>%
  summarize(
    meanincome = mean(MonthlyIncome),
    calcAttrition = (sum(Attritioncalc) / n()),
    Employees = n(),
    # State summarise()'s default explicitly: the result stays grouped by
    # Department exactly as before, but the "has grouped output by" message
    # is no longer emitted.
    .groups = "drop_last"
  ) %>%
  arrange(desc(Employees))
er
## # A tibble: 15 × 5
## # Groups:   Department [3]
##    Department             JobLevel meanincome calcAttrition Employees
##    <chr>                     <int>      <dbl>         <dbl>     <int>
##  1 Research & Development        1      2793.        0.219        256
##  2 Research & Development        2      5435.        0.0542       166
##  3 Sales                         2      5678.        0.146        144
##  4 Research & Development        3     10248.        0.0909        77
##  5 Sales                         3      9331.        0.189         53
##  6 Sales                         1      2519.        0.48          50
##  7 Research & Development        4     15374.        0.0256        39
##  8 Research & Development        5     19304.        0.0833        24
##  9 Human Resources               1      2691.        0.261         23
## 10 Sales                         4     14863.        0.105         19
## 11 Sales                         5     18965.        0.286          7
## 12 Human Resources               5     19207.        0              6
## 13 Human Resources               2      4982.        0              2
## 14 Human Resources               3      8412.        0              2
## 15 Human Resources               4     16618         0              2
# Graph: one bubble per Department / JobLevel pair from the summary table `er`.
# Colour encodes headcount (Employees); size encodes attrition rate
# (calcAttrition).
# alpha is a fixed setting, so it is passed outside aes(); inside aes() it is
# treated as a data mapping and adds a meaningless "alpha" legend.
# NOTE(review): this is the same "Attrition by Job Level" chart that is drawn
# earlier in the file - confirm whether the repetition is intentional.
ggplot(er, aes(x = Department, y = JobLevel)) +
  geom_point(aes(color = Employees, size = calcAttrition), alpha = 0.5) +
  scale_color_viridis(option = "viridis", name = "Employees") +
  scale_size(range = c(1, 10), name = "Attrition %") +
  ggtitle("Attrition by Job Level")

### By Department

##### The Sales Department has the highest % of Attrition (21.6%)

##### 54% of Attrition comes from Research and Development, but Research and Development is also the largest Department.

By Job Level

Overall Job Level 1 has the highest % of Attrition
Within the Sales Department, Job Level 5 has a higher attrition % (29%) than Job Level 1 in the other departments.

By Job Role

68% of attrition is from 3 Job Roles:

-Sales Executive (Sales)

-Research Scientist (Research & Development)

-Laboratory Technician (Research & Development)

## Loading required package: lattice
library(e1071)

# Build the standardised modelling frames for the attrition classifier.
#
# predictor_cols maps each column name used in the modelling frames (names)
# to the source column it is taken from (values).
predictor_cols <- c(
  Age = "Age",
  JobInvolvement = "JobInvolvement",
  JobLevel = "JobLevel",
  Distance = "DistanceFromHome",
  StockOptionLevel = "StockOptionLevel",
  EnvironmentSatisfaction = "EnvironmentSatisfaction",
  RelationshipSatisfaction = "RelationshipSatisfaction",
  JobSatisfaction = "JobSatisfaction",
  YearSinceLastPromotion = "YearsSinceLastPromotion",
  YearsinCurrentRole = "YearsInCurrentRole",
  Education = "Education"
)

# Training predictors: centre and scale every column with its own mean / sd.
train_scaled <- scale(as.matrix(casestudy2[unname(predictor_cols)]))
train_center <- attr(train_scaled, "scaled:center")
train_sd <- attr(train_scaled, "scaled:scale")
colnames(train_scaled) <- names(predictor_cols)

# Test predictors: reuse the TRAINING mean / sd. Standardising the test set
# with its own statistics would put the two frames on different scales, which
# distorts the distances a nearest-neighbour model relies on.
test_scaled <- scale(
  as.matrix(casestudy2.noattrition[unname(predictor_cols)]),
  center = train_center,
  scale = train_sd
)
colnames(test_scaled) <- names(predictor_cols)

clean_casestudy2 <- data.frame(
  Attrition = casestudy2$Attrition,
  train_scaled
)
# If casestudy2.noattrition has no Attrition column, `$Attrition` is NULL and
# data.frame() simply leaves the column out.
# NOTE(review): presumably this is the unlabeled data set - confirm.
casestudy2.noatt <- data.frame(
  Attrition = casestudy2.noattrition$Attrition,
  test_scaled
)

test <- casestudy2.noatt
train <- clean_casestudy2

# Confusion matrix results for each row: numks-by-1 matrices, initialised to
# NA, for accuracy, sensitivity, specificity and the k value.
numks <- 30

masterAcc <- matrix(NA, nrow = numks, ncol = 1)
masterSens <- matrix(NA, nrow = numks, ncol = 1)
masterSpec <- matrix(NA, nrow = numks, ncol = 1)
masterK <- matrix(NA, nrow = numks, ncol = 1)

# The former data(attrition) call was removed: no loaded package provides that
# data set, so it only raised "data set 'attrition' not found" and loaded
# nothing.
##    Accuracy Sensitivity Specificity           K 
##   0.8191111   0.8356097   0.3526761  15.5000000
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.6     ✓ purrr   0.3.4
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ readr   2.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()

Attrition

Looking at the 16 closest employees (k = 16) produces the best results.

Model accuracy 83.3%

Specificity 83.7%

Sensitivity 60.0%

Factors for the model were:

-Age

-Job Involvement

-Job Level

-Distance

-Stock Option Level

-Environment Satisfaction

-Relationship Satisfaction

-Job Satisfaction

-Year Since Last Promotion

-Years in Current Role

-Education

## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + YearsInCurrentRole, data = casestudy2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4981.4  -928.0    71.8   693.6  3751.1 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1769.71     104.03 -17.012   <2e-16 ***
## JobLevel            4034.21      47.78  84.425   <2e-16 ***
## YearsInCurrentRole   -15.72      14.31  -1.099    0.272    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1413 on 867 degrees of freedom
## Multiple R-squared:  0.9057, Adjusted R-squared:  0.9055 
## F-statistic:  4166 on 2 and 867 DF,  p-value: < 2.2e-16
##                          2.5 %      97.5 %
## (Intercept)        -1973.88338 -1565.52861
## JobLevel            3940.42571  4128.00029
## YearsInCurrentRole   -43.81716    12.36847
## `summarise()` has grouped output by 'JobLevel'. You can override using the `.groups` argument.

## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Warning: 'surface' objects don't have these attributes: 'mode'
## Valid attributes include:
## '_deprecated', 'autocolorscale', 'cauto', 'cmax', 'cmid', 'cmin', 'coloraxis', 'colorbar', 'colorscale', 'connectgaps', 'contours', 'customdata', 'customdatasrc', 'hidesurface', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'lighting', 'lightposition', 'meta', 'metasrc', 'name', 'opacity', 'opacityscale', 'reversescale', 'scene', 'showlegend', 'showscale', 'stream', 'surfacecolor', 'surfacecolorsrc', 'text', 'textsrc', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Loading required package: measures
## 
## Attaching package: 'measures'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## Loading required package: party
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Attaching package: 'varImp'
## The following object is masked from 'package:caret':
## 
##     varImp

Monthly Income

Used linear regression to predict monthly income using:

-Job Level

-Years in Current Role

RMSE (root mean squared error) of $1,413

Attrition Factors

Overall, the most important factors for Attrition according to the Random Forest model:

-Job Level

-Job Role

-Years in Current Role